This is an initial report, written as a test, that analyses the causality between variables of development projects (GitHub) and quality characteristics of the software (SonarCloud). The data has been obtained through the public APIs of both platforms, and the JSON data has been filtered and pre-processed using an intermediate MongoDB document database.
The result of the preprocessing has been stored in a CSV file. The first step is to import the data from that file.
library(readr)
# Load the pre-processed measures from the CSV file; the path is relative to
# the directory the report is run from.
sonar_git <- read_csv("../data/sonar-git.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## project = col_character(),
## version = col_character(),
## from = col_datetime(format = ""),
## to = col_datetime(format = ""),
## file_complexity_distribution = col_character(),
## function_complexity_distribution = col_character(),
## alert_status = col_character()
## )
## See spec(...) for full column specifications.
Then, we filter the matrix (with 107 variables) to keep only the variables of interest (chosen after some preliminary analyses). With the filtered data, we show the descriptive statistics.
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep the 13 columns used in the rest of the analysis. `lines` is retained
# only as the denominator for the per-size normalisation that follows.
dataset0 <- sonar_git %>%
  select(
    project, version,
    commits, committers, changes_by_commit, committers_weight,
    bugs, code_smells, complexity, violations,
    duplicated_lines, open_issues,
    lines
  )
# Filter missing values: drop, by position, the rows with incomplete data.
# (Rows 172 and 173 used to be listed separately but are already covered by
# 133:190.)
dataset0 <- dataset0[-c(133:190, 195:205, 212:228, 233:245, 249:256), ]

# Express the count-based measures per 1000 lines of code so that projects of
# different size can be compared. Every measure is divided exactly once:
# `commits` used to be divided twice, which turned it into commits / kLOC^2.
# `changes_by_commit`, `committers_weight` and `complexity` are left unscaled,
# as before.
# NOTE: console output and axis limits rendered before this fix still reflect
# the doubly-scaled `commits` values and should be regenerated.
dataset1 <- mutate(
  dataset0,
  commits = commits / (lines / 1000),
  committers = committers / (lines / 1000),
  bugs = bugs / (lines / 1000),
  code_smells = code_smells / (lines / 1000),
  violations = violations / (lines / 1000),
  duplicated_lines = duplicated_lines / (lines / 1000),
  open_issues = open_issues / (lines / 1000)
)

# `lines` (column 13) was only needed as the denominator; keep columns 1:12.
dataset1 <- dataset1[, 1:12]
summary(dataset1)
## project version commits committers
## Length:156 Length:156 Min. :0.000000 Min. :0.00000
## Class :character Class :character 1st Qu.:0.001144 1st Qu.:0.02349
## Mode :character Mode :character Median :0.006117 Median :0.05650
## Mean :0.023112 Mean :0.08222
## 3rd Qu.:0.022821 3rd Qu.:0.11896
## Max. :0.358338 Max. :0.32386
## changes_by_commit committers_weight bugs code_smells
## Min. : 0.00 Min. :0.00000 Min. :0.00000 Min. : 0.000
## 1st Qu.: 48.84 1st Qu.:0.02117 1st Qu.:0.00000 1st Qu.: 2.174
## Median : 178.56 Median :0.06510 Median :0.07789 Median : 5.738
## Mean : 946.46 Mean :0.23019 Mean :0.31509 Mean : 12.283
## 3rd Qu.: 428.46 3rd Qu.:0.23180 3rd Qu.:0.63989 3rd Qu.: 17.862
## Max. :34902.00 Max. :1.00000 Max. :1.38801 Max. :106.491
## complexity violations duplicated_lines open_issues
## Min. : 1026 Min. : 0.6145 Min. : 2.548 Min. : 0.6145
## 1st Qu.: 2580 1st Qu.: 2.5552 1st Qu.: 8.982 1st Qu.: 2.5513
## Median : 4026 Median : 6.4159 Median : 14.523 Median : 5.3882
## Mean : 12043 Mean : 14.1213 Mean : 26.695 Mean : 13.8915
## 3rd Qu.: 11648 3rd Qu.: 24.5530 3rd Qu.: 28.229 3rd Qu.: 24.5530
## Max. :143551 Max. :106.4907 Max. :157.694 Max. :106.4907
First, we analyse the relationship between commits and committers.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Distribution of commits and of committers for each project, as horizontal
# box plots (numeric variable on x, project on y).
p <- ggplot(dataset1, aes(x = commits, y = project)) +
  geom_boxplot(fill = "gray")
p
p <- ggplot(dataset1, aes(x = committers, y = project)) +
  geom_boxplot(fill = "gray")
p

# Commits against committers, coloured by project, with filled 2-D density
# contours on top. `after_stat(level)` replaces the deprecated `..level..`
# notation for referring to a computed variable.
sp <- ggplot(dataset1, aes(x = commits, y = committers)) +
  geom_point(aes(colour = project)) +
  stat_density_2d(
    aes(fill = after_stat(level)),
    geom = "polygon",
    alpha = 0.2
  ) +
  scale_fill_gradient(low = "green", high = "red")
sp + theme_classic()

# Same plot with the visible window restricted; coord_cartesian() zooms
# without removing points from the density estimate.
zoom_sp <- sp + coord_cartesian(xlim = c(0, 2), ylim = c(0, 0.2))
zoom_sp + theme_classic()

# Two-dimensional kernel density estimate on a 50 x 50 grid, drawn as an
# interactive surface.
kd <- with(dataset1, MASS::kde2d(committers, commits, n = 50))
fig <- plot_ly(x = kd$x, y = kd$y, z = kd$z) %>% add_surface()
fig
As preliminary analysis, we compute correlation values and draw a matrix of scatter plots:
# Numeric measures only: drop the two identifier columns (project, version),
# which are the first and second columns of dataset1.
dataset_only_data <- dataset1 %>% select(-1, -2)
# Pairwise correlation matrix (cor() default method) of the measures.
M <- cor(x = dataset_only_data)
# Scatter-plot matrix of every pair of measures.
plot(dataset_only_data)
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.6.3
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
# Correlogram in the original column order: shaded cells below the diagonal,
# pie glyphs above it and the variable names on the diagonal.
corrgram(
  dataset_only_data,
  order = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "correlation between variables"
)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
# The same correlation matrix drawn with three different glyph styles.
corrplot(method = "circle", corr = M)
corrplot(method = "ellipse", corr = M)
corrplot(method = "number", corr = M)

# Palette function running from green through white to red, and the pairwise
# correlation tests (95 % confidence level) used to mask weak correlations.
col <- colorRampPalette(
  c("#77AA44", "#AADD77", "#FFFFFF", "#EE9988", "#BB4444")
)
res1 <- cor.mtest(dataset_only_data, conf.level = .95)

# Upper-triangle heat map in the original variable order.
corrplot(
  M,
  method = "color",
  type = "upper",
  order = "original",
  col = col(200),
  # print the correlation coefficient in each cell
  addCoef.col = "black",
  number.cex = .8,
  # text label colour and rotation
  tl.col = "black",
  tl.srt = 90,
  # leave blank the cells whose p-value is above the significance level
  p.mat = res1$p,
  sig.level = 0.05,
  insig = "blank",
  # hide the principal diagonal
  diag = FALSE
)
We focus on some variables between which we observe a certain correlation. First, we observe the behaviour of commits against complexity.
library(ggplot2)
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 3.6.3
## Loading required package: magrittr
# Rows of the sonarqube project only (all 12 columns). The rows are selected
# by filtering on the column itself rather than by indexing the tibble with a
# one-column logical matrix, a form of row index that tibble has deprecated.
sonarqube <- dplyr::filter(dataset1, project == "sonarqube")

# Commits against bugs with a linear fit (dashed line, grey confidence band)
# and the Pearson correlation printed on the plot.
sp <- ggplot(sonarqube, aes(x = commits, y = bugs)) +
  geom_point(aes(colour = project), shape = 16) +
  geom_smooth(
    method = lm,
    linetype = "dashed",
    color = "darkred",
    fill = "grey"
  ) +
  theme(legend.position = "bottom") +
  stat_cor(method = "pearson")
sp
## `geom_smooth()` using formula 'y ~ x'
# Rows of the jacoco project only (all 12 columns). The rows are selected by
# filtering on the column itself rather than by indexing the tibble with a
# one-column logical matrix, a form of row index that tibble has deprecated.
jacoco <- dplyr::filter(dataset1, project == "jacoco")

# Commits against code smells with a linear fit (dashed line, grey confidence
# band) and the Pearson correlation printed on the plot.
sp <- ggplot(jacoco, aes(x = commits, y = code_smells)) +
  geom_point(aes(colour = project), shape = 16) +
  geom_smooth(
    method = lm,
    linetype = "dashed",
    color = "darkred",
    fill = "grey"
  ) +
  stat_cor(method = "pearson") +
  theme(legend.position = "bottom")
sp
## `geom_smooth()` using formula 'y ~ x'
# Rows of the monica project only (all 12 columns). The rows are selected by
# filtering on the column itself rather than by indexing the tibble with a
# one-column logical matrix, a form of row index that tibble has deprecated.
monica <- dplyr::filter(dataset1, project == "monica")

# Changes per commit against code smells with a linear fit (dashed line, grey
# confidence band) and the Pearson correlation printed on the plot.
sp <- ggplot(monica, aes(x = changes_by_commit, y = code_smells)) +
  geom_point(aes(colour = project), shape = 16) +
  geom_smooth(
    method = lm,
    linetype = "dashed",
    color = "darkred",
    fill = "grey"
  ) +
  stat_cor(method = "pearson") +
  theme(legend.position = "bottom")
sp
## `geom_smooth()` using formula 'y ~ x'
# Rows of the Ant-Media-Server project only (all 12 columns). The rows are
# selected by filtering on the column itself rather than by indexing the
# tibble with a one-column logical matrix, a form of row index that tibble
# has deprecated.
ant <- dplyr::filter(dataset1, project == "Ant-Media-Server")

# Committers against complexity with a linear fit (dashed line, grey
# confidence band) and the Pearson correlation printed on the plot.
sp <- ggplot(ant, aes(x = committers, y = complexity)) +
  geom_point(aes(colour = project), shape = 16) +
  geom_smooth(
    method = lm,
    linetype = "dashed",
    color = "darkred",
    fill = "grey"
  ) +
  stat_cor(method = "pearson") +
  theme(legend.position = "bottom")
sp
## `geom_smooth()` using formula 'y ~ x'
# Commits against complexity for all projects, points coloured by project.
ggplot(data = dataset1, mapping = aes(x = commits, y = complexity)) +
  geom_point(mapping = aes(colour = project))

# Same data with one linear fit per project (colour is mapped globally, so
# the smoother is grouped by project); confidence bands are switched off.
ggplot(
  data = dataset1,
  mapping = aes(x = commits, y = complexity, colour = project)
) +
  geom_point(shape = 16) +
  geom_smooth(method = lm, se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# A single linear fit across all projects (colour is mapped only on the
# point layer), drawn as a dashed dark-red line with a grey band.
sp <- ggplot(data = dataset1, mapping = aes(x = commits, y = complexity)) +
  geom_point(mapping = aes(colour = project), shape = 16) +
  geom_smooth(
    method = lm,
    linetype = "dashed",
    color = "darkred",
    fill = "grey"
  )
sp
## `geom_smooth()` using formula 'y ~ x'
# Zoomed view of the same plot.
zoom_sp <- sp +
  coord_cartesian(xlim = c(0, 0.2), ylim = c(-20000, 20000))
zoom_sp
## `geom_smooth()` using formula 'y ~ x'
# Commits against complexity, points coloured by project.
sp <- ggplot(dataset1, aes(x = commits, y = complexity)) +
  geom_point(aes(colour = project))

# Overlay 2-D density contour lines.
sp + geom_density_2d()

# Overlay filled density polygons coloured by density level.
# `after_stat(level)` replaces the deprecated `..level..` notation.
sp +
  stat_density_2d(aes(fill = after_stat(level)), geom = "polygon") +
  scale_fill_gradient(low = "green", high = "red")
# Density of committers, one semi-transparent curve per project.
committers_density <- ggplot(
  data = dataset1,
  mapping = aes(x = committers, fill = project)
) +
  geom_density(
    mapping = aes(group = project, colour = project, fill = project),
    alpha = .1
  ) +
  theme(legend.position = "right")
committers_density

# Zoomed view of the committers densities.
zoom_sp <- committers_density +
  coord_cartesian(xlim = c(0, 0.10), ylim = c(0, 75))
zoom_sp

# Density of commits, one semi-transparent curve per project.
commits_density <- ggplot(
  data = dataset1,
  mapping = aes(x = commits, fill = project)
) +
  geom_density(
    mapping = aes(group = project, colour = project, fill = project),
    alpha = .1
  ) +
  theme(legend.position = "right")
commits_density

# Zoomed view of the commits densities.
zoom_sp <- commits_density +
  coord_cartesian(xlim = c(0, 0.3), ylim = c(0, 10))
zoom_sp
We carry out a hierarchical clustering with all the variables and take 4 clusters
# Agglomerative clustering with centroid linkage on the distances between
# the (unscaled) measures, using dist()'s default metric.
# NOTE(review): the hclust documentation indicates that the "centroid" method
# is meant to be used with squared Euclidean distances - confirm whether
# plain distances are intended here.
ddata1 <- dist(x = dataset_only_data)
gdata1 <- hclust(d = ddata1, method = "centroid")

# Dendrogram with the 4-cluster cut outlined, one border colour per cluster.
plot(gdata1, xlab = "cases", ylab = "high", sub = "example")
rect.hclust(gdata1, k = 4, border = c("red", "blue", "green", "orange"))

# Cluster membership (1..4) of every row.
clusters <- cutree(gdata1, k = 4)
print(clusters)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4
## [149] 4 1 1 1 1 1 1 1
# Copy of the numeric measures that accumulates one label column per
# clustering; `cluster` holds the hierarchical (cutree) membership as a factor.
dataset_clusters <- dataset_only_data
dataset_clusters$cluster <- factor(clusters)
## K-means with scaled values
We carry out a K-means clustering with all the variables scaled, considering 4 clusters.
library(cluster)
## Warning: package 'cluster' was built under R version 3.6.3
# Method for determining the best number of clusters in K-means: look for a
# bend (elbow) in the scree plot of the within-groups sum of squares (WSS)
# for k = 1..10.
# kmeans() starts from randomly chosen centres, so the seed is fixed here to
# make both the curve and the cluster labels assigned below reproducible.
set.seed(1)
mydata <- dataset_only_data
# NOTE(review): the elbow is computed on the unscaled measures, whereas the
# clustering below uses the scaled ones - confirm this is intended.
wss <- numeric(10)
# k = 1: total sum of squares around the column means.
wss[1] <- (nrow(mydata) - 1) * sum(vapply(mydata, var, numeric(1)))
for (i in 2:10) {
  # tot.withinss is the sum of the per-cluster within sums of squares.
  wss[i] <- kmeans(mydata, centers = i)$tot.withinss
}
plot(seq_len(10), wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares")

# Standardise every measure with scale() and partition the rows into 4
# clusters.
zdata1 <- scale(dataset_only_data)
kcdata1 <- kmeans(x = zdata1, centers = 4)
kcdata1$cluster
## [1] 4 4 4 2 4 4 4 4 2 2 4 2 2 4 2 2 4 2 2 2 2 2 4 2 2 4 4 4 2 4 4 2 2 2 2 2 2
## [38] 4 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 4 2 4 4 4 4 4 4 2 2 2 2 4 4
## [75] 4 4 4 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 2 2 2 4 2 2 2 2 2 2 2 4 2 4 2 1 1
## [149] 1 3 3 3 3 3 3 3
# Bivariate cluster plot of the scaled data: coloured, shaded ellipses, all
# points and ellipses labelled (labels = 2), no lines between ellipses.
clusplot(
  zdata1,
  kcdata1$cluster,
  color = TRUE,
  shade = TRUE,
  labels = 2,
  lines = 0
)

# Attach the k-means label to the numeric table (as `cluster2`) and to a copy
# of the full table that still carries project and version (as `cluster`).
dataset_clusters[["cluster2"]] <- factor(kcdata1$cluster)
dataset1_cluster <- dataset1
dataset1_cluster[["cluster"]] <- factor(kcdata1$cluster)
We performed the characterization of clusters for the k-means algorithm
par(mfrow=c(1,1))
library(lattice)
##
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
##
## panel.fill
# Scatter-plot matrix of columns 1:9 of dataset_clusters (every measure
# except open_issues, the 10th column), with points grouped by the scaled
# k-means label `cluster2`.
splom(~ dataset_clusters[1:9], groups = cluster2, data = dataset_clusters, pch = 16)
library(vioplot)
## Warning: package 'vioplot' was built under R version 3.6.3
## Loading required package: sm
## Warning: package 'sm' was built under R version 3.6.3
## Package 'sm', version 2.2-5.6: type help(sm) for summary information
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.6.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Code smells split by scaled k-means cluster (cluster2), one violin each.
smells_by_cluster <- split(dataset_clusters$code_smells, dataset_clusters$cluster2)
x1 <- smells_by_cluster[["1"]]
x2 <- smells_by_cluster[["2"]]
x3 <- smells_by_cluster[["3"]]
x4 <- smells_by_cluster[["4"]]
vioplot(x1, x2, x3, x4, names = c("1", "2", "3", "4"), col = "grey")
title("Code smells per cluster")

# Commits split by the same clusters, one violin each.
commits_by_cluster <- split(dataset_clusters$commits, dataset_clusters$cluster2)
x1 <- commits_by_cluster[["1"]]
x2 <- commits_by_cluster[["2"]]
x3 <- commits_by_cluster[["3"]]
x4 <- commits_by_cluster[["4"]]
vioplot(x1, x2, x3, x4, names = c("1", "2", "3", "4"), col = "grey")
title("Commits per cluster")
We compute correlation and scatter plots for clusters
library(corrplot)
# The ten measures (columns 1:10) of the rows in each scaled k-means cluster.
# Rows are selected with a plain logical vector rather than a one-column
# logical matrix, a form of row index that tibble has deprecated.
c1 <- dataset_clusters[dataset_clusters$cluster2 == "1", 1:10]
c2 <- dataset_clusters[dataset_clusters$cluster2 == "2", 1:10]
c3 <- dataset_clusters[dataset_clusters$cluster2 == "3", 1:10]
c4 <- dataset_clusters[dataset_clusters$cluster2 == "4", 1:10]

# Correlation matrix of each cluster with the coefficients printed.
for (cluster_data in list(c1, c2, c3, c4)) {
  corrplot(cor(cluster_data), method = "number")
}

# Palette function running from green through white to red.
col <- colorRampPalette(c("#77AA44", "#AADD77", "#FFFFFF", "#EE9988", "#BB4444"))

# Upper-triangle heat map for each cluster, in cluster order 1..4. Cells whose
# p-value (cor.mtest, 95 % level) is above 0.05 are left blank and the
# diagonal is hidden. `res1` ends up holding the tests for cluster 4.
for (cluster_data in list(c1, c2, c3, c4)) {
  res1 <- cor.mtest(cluster_data, conf.level = .95)
  corrplot(cor(cluster_data), method = "color", col = col(200),
           type = "upper", order = "original", number.cex = .8,
           addCoef.col = "black", tl.col = "black", tl.srt = 90,
           p.mat = res1$p, sig.level = 0.05, insig = "blank", diag = FALSE)
}
# Commits against code smells; colour and shape encode the k-means cluster
# and a dashed linear fit is drawn per cluster. The view is zoomed.
sp <- ggplot(
  data = dataset1_cluster,
  mapping = aes(x = commits, y = code_smells, colour = cluster, shape = cluster)
) +
  geom_point() +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "grey") +
  coord_cartesian(xlim = c(0, 0.25), ylim = c(0, 45))
sp
## `geom_smooth()` using formula 'y ~ x'
# Committers against code smells, same encoding, zoomed.
sp <- ggplot(
  data = dataset1_cluster,
  mapping = aes(x = committers, y = code_smells, colour = cluster, shape = cluster)
) +
  geom_point() +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "grey") +
  coord_cartesian(xlim = c(0, 0.3), ylim = c(0, 50))
sp
## `geom_smooth()` using formula 'y ~ x'
# Commits against complexity, same encoding, full range.
sp <- ggplot(
  data = dataset1_cluster,
  mapping = aes(x = commits, y = complexity, colour = cluster, shape = cluster)
) +
  geom_point() +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "grey")
sp
## `geom_smooth()` using formula 'y ~ x'
# Committers against complexity, same encoding, full range.
sp <- ggplot(
  data = dataset1_cluster,
  mapping = aes(x = committers, y = complexity, colour = cluster, shape = cluster)
) +
  geom_point() +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "grey")
sp
## `geom_smooth()` using formula 'y ~ x'
Some 3d plots with correlations of several measures by cluster
## Plotting for the sonarqube project, cluster 2 and 3 differences
Plotting for the sonarqube project, cluster 2 and 3 differences
library(ggplot2)
library(ggpubr)
# Make the minimal theme the default for the ggplot2 plots drawn from here on.
theme_set(theme_minimal())
# Print the table that carries the k-means label in its `cluster` column.
dataset1_cluster
## # A tibble: 156 x 13
## project version commits committers changes_by_comm~ committers_weig~ bugs
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 monica master 0.0227 0.112 178. 0.0151 0.576
## 2 monica 2.16.0 0.0306 0.254 228. 0.0091 0.553
## 3 monica 2.15.2 0.0222 0.198 180. 0.0153 0.638
## 4 monica 2.15.1 0.00119 0.0154 10.8 1 1.39
## 5 monica 2.15.0 0.0633 0.324 136. 0.0059 1.39
## 6 monica 2.13.0 0.0116 0.164 178. 0.0287 0.738
## 7 monica 2.12.1 0.0128 0.165 225. 0.0351 0.760
## 8 monica 2.12.0 0.0293 0.199 2097. 0.0132 0.764
## 9 monica 2.11.2 0.00116 0.0361 93.5 0.221 0.553
## 10 monica 2.11.1 0.00233 0.0362 1150. 0.147 0.555
## # ... with 146 more rows, and 6 more variables: code_smells <dbl>,
## # complexity <dbl>, violations <dbl>, duplicated_lines <dbl>,
## # open_issues <dbl>, cluster <fct>
# Rows of the Ant-Media-Server project, with all 13 columns (measures plus
# the k-means `cluster` label). Rows are selected by filtering on the column
# itself rather than by indexing the tibble with a one-column logical matrix,
# a form of row index that tibble has deprecated.
# NOTE(review): the surrounding heading mentions the sonarqube project while
# this code selects Ant-Media-Server - confirm which one is intended.
ant <- dplyr::filter(dataset1_cluster, project == "Ant-Media-Server")

# One plot per measure: the value for each version joined by a single line
# (group = 1), with points coloured and shaped by cluster. The version labels
# on the x axis are hidden. `data =` is spelled out in full instead of relying
# on partial argument matching (`dat =`).
p_code_smells <- ggplot(data = ant, aes(x = version, y = code_smells)) +
  geom_line(aes(group = 1)) +
  geom_point(aes(colour = cluster, shape = cluster, group = cluster), size = 3) +
  theme(axis.text.x = element_blank())
p_code_smells

p_bugs <- ggplot(data = ant, aes(x = version, y = bugs)) +
  geom_line(aes(group = 1)) +
  geom_point(aes(colour = cluster, shape = cluster, group = cluster), size = 3) +
  theme(axis.text.x = element_blank())
p_bugs

p_cloning <- ggplot(data = ant, aes(x = version, y = duplicated_lines)) +
  geom_line(aes(group = 1)) +
  geom_point(aes(colour = cluster, shape = cluster, group = cluster), size = 3) +
  theme(axis.text.x = element_blank())
p_cloning

p_violations <- ggplot(data = ant, aes(x = version, y = violations)) +
  geom_line(aes(group = 1)) +
  geom_point(aes(colour = cluster, shape = cluster, group = cluster), size = 3) +
  theme(axis.text.x = element_blank())
p_violations

p_committers <- ggplot(data = ant, aes(x = version, y = committers)) +
  geom_line(aes(group = 1)) +
  geom_point(aes(colour = cluster, shape = cluster, group = cluster), size = 3) +
  theme(axis.text.x = element_blank())
p_committers

# Code smells, duplicated lines and committers stacked in a single column,
# labelled a, b and c.
figure <- ggarrange(
  p_code_smells, p_cloning, p_committers,
  labels = c("a", "b", "c"),
  ncol = 1, nrow = 3
) +
  theme(axis.text.x = element_blank())
figure
##k-means for normalized values
We perform the k-means algorithm with normalized values and Euclidean distance.
library(vegan)
## Warning: package 'vegan' was built under R version 3.6.3
## Loading required package: permute
## Warning: package 'permute' was built under R version 3.6.3
## Registered S3 methods overwritten by 'vegan':
## method from
## reorder.hclust seriation
## rev.hclust dendextend
## This is vegan 2.5-6
library(permute)
# Data normalisation with vegan's "normalize" standardisation.
spe.norm <- decostand(dataset_only_data, method = "normalize")

# Euclidean distances between the normalised rows, clustered with Ward's
# method ("ward.D") and drawn as a dendrogram.
spe.ch <- vegdist(spe.norm, method = "euc")
spe.ch.ward <- hclust(d = spe.ch, method = "ward.D")
plot(spe.ch.ward, sub = "Ward method")

# K-means partitions for 2 to 10 groups (1000 iterations each), compared with
# the "ssi" criterion; the results table holds SSE and ssi per partition.
spe.KM.cascade <- cascadeKM(
  spe.norm,
  inf.gr = 2,
  sup.gr = 10,
  iter = 1000,
  criterion = "ssi"
)
spe.KM.cascade$results
## 2 groups 3 groups 4 groups 5 groups 6 groups 7 groups 8 groups
## SSE 2.30746283 0.6965684 0.45755324 0.32813874 0.19934571 0.14242250 0.11631444
## ssi 0.02105365 0.0220436 0.02226886 0.02693511 0.02217787 0.02252665 0.02247974
## 9 groups 10 groups
## SSE 0.09939645 0.09112660
## ssi 0.02574863 0.02797884
# Plot the cascade of k-means partitions, with objects sorted by group.
plot(spe.KM.cascade, sortg = TRUE)

# Compute k-means: 4 clusters, best of 100 random starts. The seed is set
# immediately before the only kmeans() call, so the partition is reproducible
# and the silhouette plot below describes exactly the partition used in the
# rest of the analysis. Previously the silhouette was computed on an unseeded
# run that was then discarded.
set.seed(1)
spe.kmeans <- kmeans(spe.norm, centers = 4, nstart = 100)

# Silhouette plot of that partition, using the dissimilarities from daisy().
# `$cluster` is spelled out in full instead of relying on partial matching
# of `$cl`.
dissE <- daisy(spe.norm)
sk <- silhouette(spe.kmeans$cluster, dissE)
plot(sk)

# Cross-tabulate the k-means clusters (rows) against the 4-group cut of the
# Ward dendrogram (columns).
spebc.ward.g <- cutree(spe.ch.ward, k = 4)
table(spe.kmeans$cluster, spebc.ward.g)
## spebc.ward.g
## 1 2 3 4
## 1 0 0 11 0
## 2 33 76 0 0
## 3 24 0 7 0
## 4 0 0 0 5
# Bivariate cluster plot of the normalised data: coloured, shaded ellipses,
# all points and ellipses labelled (labels = 2), no lines between ellipses.
clusplot(
  spe.norm,
  spe.kmeans$cluster,
  color = TRUE,
  shade = TRUE,
  labels = 2,
  lines = 0
)
# Store the normalised-data k-means label as `cluster3`.
dataset_clusters[["cluster3"]] <- factor(spe.kmeans$cluster)
We performed the characterization of clusters for the k-means algorithm
# Single-panel layout for the base-graphics plots that follow.
par(mfrow = c(1, 1))
library(lattice)
# Scatter-plot matrix of columns 1:9 of dataset_clusters, grouped by the
# normalised-data k-means label `cluster3`.
splom(~ dataset_clusters[1:9], data = dataset_clusters, groups = cluster3, pch = 16)
library(vioplot)

# Code smells split by cluster3, one violin each.
smells_by_cluster <- split(dataset_clusters$code_smells, dataset_clusters$cluster3)
x1 <- smells_by_cluster[["1"]]
x2 <- smells_by_cluster[["2"]]
x3 <- smells_by_cluster[["3"]]
x4 <- smells_by_cluster[["4"]]
vioplot(x1, x2, x3, x4, names = c("1", "2", "3", "4"), col = "grey")
title("Code smells per cluster")

# Commits split by cluster3, one violin each.
commits_by_cluster <- split(dataset_clusters$commits, dataset_clusters$cluster3)
x1 <- commits_by_cluster[["1"]]
x2 <- commits_by_cluster[["2"]]
x3 <- commits_by_cluster[["3"]]
x4 <- commits_by_cluster[["4"]]
vioplot(x1, x2, x3, x4, names = c("1", "2", "3", "4"), col = "grey")
title("Commits per cluster")
We compute correlation and scatter plots for clusters
# The ten measures (columns 1:10) of the rows in each normalised-data k-means
# cluster (cluster3). Rows are selected with a plain logical vector rather
# than a one-column logical matrix, a form of row index that tibble has
# deprecated.
c1 <- dataset_clusters[dataset_clusters$cluster3 == "1", 1:10]
c2 <- dataset_clusters[dataset_clusters$cluster3 == "2", 1:10]
c3 <- dataset_clusters[dataset_clusters$cluster3 == "3", 1:10]
c4 <- dataset_clusters[dataset_clusters$cluster3 == "4", 1:10]

# Correlation matrix of each cluster with the coefficients printed.
for (cluster_data in list(c1, c2, c3, c4)) {
  corrplot(cor(cluster_data), method = "number")
}

# Palette function running from green through white to red.
col <- colorRampPalette(c("#77AA44", "#AADD77", "#FFFFFF", "#EE9988", "#BB4444"))

# Upper-triangle heat map for each cluster, in cluster order 1..4. Cells whose
# p-value (cor.mtest, 95 % level) is above 0.05 are left blank and the
# diagonal is hidden. `res1` ends up holding the tests for cluster 4.
for (cluster_data in list(c1, c2, c3, c4)) {
  res1 <- cor.mtest(cluster_data, conf.level = .95)
  corrplot(cor(cluster_data), method = "color", col = col(200),
           type = "upper", order = "original", number.cex = .8,
           addCoef.col = "black", tl.col = "black", tl.srt = 90,
           p.mat = res1$p, sig.level = 0.05, insig = "blank", diag = FALSE)
}

# Commits against complexity; colour and shape encode cluster3 and a dashed
# linear fit is drawn per cluster.
sp <- ggplot(
  dataset_clusters,
  aes(x = commits, y = complexity, colour = cluster3, shape = cluster3)
) +
  geom_point() +
  geom_smooth(method = lm, linetype = "dashed", color = "darkred", fill = "grey")
sp
## `geom_smooth()` using formula 'y ~ x'
Some 3d plots with correlations of several measures by cluster